# © 2023, Scripnic Dinu, all rights reserved
## 3.1 Mobile phone picture
# Set the working directory to the folder that contains this script so that
# the relative data paths used in this file resolve.
# The document path is only available inside RStudio. requireNamespace() keeps
# the script runnable on machines where rstudioapi is not installed, where a
# bare rstudioapi::isAvailable() call would fail.
if (requireNamespace("rstudioapi", quietly = TRUE) &&
    rstudioapi::isAvailable()) {
  script_path <- rstudioapi::getActiveDocumentContext()$path
  # An empty path (e.g. a document that has never been saved) would lead to
  # setwd(""), which raises an error, so the directory is left unchanged.
  if (nzchar(script_path)) {
    setwd(dirname(script_path))
  }
}
# load libraries
library(data.table)
library(magick)
## Warning: package 'magick' was built under R version 4.2.3
## Linking to ImageMagick 6.9.12.3
## Enabled features: cairo, freetype, fftw, ghostscript, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fontconfig, x11
library(cluster)
library(imager)
## Warning: package 'imager' was built under R version 4.2.3
## Loading required package: magrittr
##
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
##
## add
## The following objects are masked from 'package:stats':
##
## convolve, spectrum
## The following object is masked from 'package:graphics':
##
## frame
## The following object is masked from 'package:base':
##
## save.image
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.2
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:imager':
##
## highlight
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.2.3
##
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(FNN)
## Warning: package 'FNN' was built under R version 4.2.3
# Load the picture and shrink it to 256 x 256 pixels.
img <- imager::load.image("./img.jpg")
img <- imager::resize(img, size_x = 256, size_y = 256)
plot(img)

# One row per pixel: coordinates x, y plus one column per colour channel
# (c.1, c.2, c.3).
pixels <- as.data.frame(img, wide = "c")
setDT(pixels)

# Cluster on the colour channels only. Feeding x and y to kmeans() as well
# lets the pixel coordinates (1..256) swamp the colour values (used below as
# rgb() inputs, i.e. on a 0..1 scale), so the clusters would be spatial tiles
# instead of groups of similar colours.
rgb_cols <- c("c.1", "c.2", "c.3")
pixel_colours <- pixels[, rgb_cols, with = FALSE]

# kmeans() starts from random centres; a fixed seed makes the elbow plot and
# the quantised picture reproducible.
set.seed(42)

# Elbow plot: total within-cluster sum of squares for k = 1..10.
# nstart / iter.max guard against poor local optima and against the default
# limit of 10 iterations being too small for this many points.
k_grid <- 1:10
wss <- vapply(
  k_grid,
  function(k) {
    kmeans(pixel_colours, centers = k, nstart = 5, iter.max = 50)$tot.withinss
  },
  numeric(1)
)
plot(k_grid, wss, type = "b", xlab = "Number of clusters", ylab = "Within-cluster sum of squares")
### Conclusion * The optimal number of clusters is 4
# NOTE(review): 4 was read off the elbow plot of the earlier run that also
# clustered on x and y - re-check it against the colour-only elbow plot.
best_k <- 4
km <- kmeans(pixel_colours, centers = best_k, nstart = 5, iter.max = 50)

# Replace the colour of every pixel by the centre of the cluster it belongs to.
pixel_centres <- km$centers[km$cluster, , drop = FALSE]
dt_newimg <- data.table(
  x = pixels[["x"]],
  y = pixels[["y"]],
  R = pixel_centres[, "c.1"],
  G = pixel_centres[, "c.2"],
  B = pixel_centres[, "c.3"]
)

# Draw the quantised picture; the y axis is reversed so that row 1 is at the
# top, and the axes share one scale so pixels stay square.
plotly::plot_ly(
  data = dt_newimg,
  x = ~x,
  y = ~y,
  type = "scattergl",
  mode = "markers",
  marker = list(color = ~rgb(R, G, B))
) |>
  plotly::layout(
    yaxis = list(autorange = "reversed", scaleanchor = "x", scaleratio = 1)
  )
# fviz_cluster(km, data = dt_rgb) # if there are more than 2 dim, it uses PCA
# Read the drilling data set (columns x and y, see the summary below).
data <- read.csv("./drilling.csv")
summary(data)
## x y
## Min. :-0.04969 Min. :-0.05967
## 1st Qu.: 0.19177 1st Qu.: 0.18619
## Median : 0.40684 Median : 0.40299
## Mean : 0.41582 Mean : 0.45929
## 3rd Qu.: 0.65068 3rd Qu.: 0.73264
## Max. : 0.88515 Max. : 1.06394
# Scatterplot of the raw points.
plot(data$x, data$y, type = "p")

# feedData holds the two clustering features, x and y.
feedData <- data[, c("x", "y")]

# kmeans() starts from random centres: fix the seed so the elbow plot and the
# cluster colours are the same on every run, and use several starts so one
# unlucky initialisation does not distort the curve.
set.seed(42)
k_grid <- 1:10
wss <- vapply(
  k_grid,
  function(k) kmeans(feedData, centers = k, nstart = 10)$tot.withinss,
  numeric(1)
)
plot(k_grid, wss, type = "b", xlab = "Number of clusters", ylab = "Within-cluster sum of squares")
### Conclusion * The optimal number of clusters is 4
best_k <- 4
kmeans_model <- kmeans(feedData, centers = best_k, nstart = 10)

# Store the cluster label of every point and colour the scatterplot by it.
data$kmeans_cluster <- kmeans_model$cluster
plot(data$x, data$y, type = "p", col = data$kmeans_cluster)
### 3.2.3 Hierarchical clustering
# Agglomerative clustering with cluster::agnes(), once per linkage method.
# The names are the values passed to agnes(method = ), the values are the
# dendrogram titles.
linkage_titles <- c(
  average = "Average linkage",
  single = "Single linkage",
  complete = "Complete linkage",
  ward = "Ward linkage"
)
linkage_methods <- names(linkage_titles)

# Fit every model exactly once and keep them in a list named by linkage.
hcl <- lapply(
  linkage_methods,
  function(method_name) cluster::agnes(feedData, method = method_name)
)
names(hcl) <- linkage_methods

# which.plots = 2 selects the dendrogram of an agnes object.
for (linkage in linkage_methods) {
  plot(hcl[[linkage]], which.plots = 2, main = linkage_titles[[linkage]])
}

# Reuse the model fitted above instead of running agnes() a second time, and
# let the plot follow best_linkage rather than a hard-coded list element.
best_linkage <- "ward"
hcl_model <- hcl[[best_linkage]]
plot(hcl_model, which.plots = 2, main = linkage_titles[[best_linkage]])
abline(h = 1.5, col = "red", lwd = 3)

# The red line at height 1.5 marks the cut that gives 4 clusters.
# cutree() is defined for hclust trees, so convert the agnes object first.
n_clusters <- 4
tree <- cutree(as.hclust(hcl_model), k = n_clusters)
data$hcl_cluster <- tree
plot(data$x, data$y, type = "p", col = data$hcl_cluster)
# Attach the packages used in this section (FNN for the nearest-neighbour
# distances, dbscan for the clustering) so it can be run on its own.
library(FNN)
library(dbscan)

minPts <- 4
# Neighbourhood radius, read off the k-distance plot drawn below.
eps <- 0.06

# 1. Distance from every point to its k-th nearest neighbour.
#    knn.dist() returns one column per neighbour (1st .. k-th); only the last
#    column belongs in the k-distance plot. Sorting the whole matrix would mix
#    the 1st..k-th neighbour distances into one curve.
#    k = minPts - 1 because minPts counts the point itself, while the
#    neighbour distances do not.
k <- minPts - 1
knn_dist <- FNN::knn.dist(feedData, k = k)[, k]
# 2. Sort the distances in ascending order.
knn_dist <- sort(knn_dist, decreasing = FALSE)
# 3. Plot the sorted distances.
plot(knn_dist)
# 4. Mark the chosen eps (where the curve bends most) with a thick red line.
abline(h = eps, col = "red", lwd = 3)

dbscan_model <- dbscan::dbscan(feedData, eps = eps, minPts = minPts)
data$dbscan_cluster <- dbscan_model$cluster
# Noise points get cluster id 0, and col = 0 is drawn in the background
# colour (invisible). Shift the ids by one so noise is black and the clusters
# use the following palette colours.
plot(data$x, data$y, type = "p", col = data$dbscan_cluster + 1L)